$\mathbf{GAUSSIAN\ (NORMAL)}$: $P(x;\mu,\sigma)=\displaystyle \frac{1}{\sqrt{2 \pi \sigma^2}} \exp\left( -\frac{(x-\mu)^2}{2 \sigma^2} \right), \hspace{1in} x \in (-\infty, \infty)$
Standard Deviations
In [18]:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
%matplotlib inline
# Plot Gaussian probability density curves to show how the mean and the
# standard deviation change the shape of the distribution.
#   u -> mean
#   s -> standard deviation
# Left panel:  u = 0, curves for s = 1 and s = 5.
# Right panel: u = 5, curves for s = 1 and s = 5.
x = np.arange(-20, 20, 0.1)

###############
# Exercise 1.1
###############
plt.subplot(1, 2, 1)
u = 0
s = 1
# stats.norm.pdf takes the mean as `loc` and the standard deviation as `scale`.
plt.plot(x, stats.norm.pdf(x, loc=u, scale=s), label='u = {}, s = {}'.format(u, s))

###############
# Exercise 1.2
###############
u = 0
s = 5
# Drawn on the same axes as 1.1 so the narrow and wide curves can be compared.
plt.plot(x, stats.norm.pdf(x, loc=u, scale=s), label='u = {}, s = {}'.format(u, s))
plt.legend()

###############
# Exercise 1.3
###############
plt.subplot(1, 2, 2)
u = 5
s = 1
plt.plot(x, stats.norm.pdf(x, loc=u, scale=s), label='u = {}, s = {}'.format(u, s))

###############
# Exercise 1.4
###############
u = 5
s = 5
plt.plot(x, stats.norm.pdf(x, loc=u, scale=s), label='u = {}, s = {}'.format(u, s))
plt.legend()
Out[18]:
Take the mean of $n$ independent random samples from ANY distribution with a well-defined mean $\mu$ and standard deviation $\sigma$. As $n$ gets larger, the distribution of the sample mean converges to a Gaussian (normal) distribution with mean $\mu$ and standard deviation $\sigma/\sqrt{n}$.
Basically, this theorem (the Central Limit Theorem) states that the average (or sum) of a set of random measurements will tend to a bell-shaped curve no matter the shape of the original measurement distribution. This explains the ubiquity of the Gaussian distribution in science and statistics.
In [21]:
import pandas
import pandasql
# Load aadhaar_data.csv into a pandas DataFrame, then normalise its column
# names: every space becomes an underscore and the name is lower-cased.
aadhaar_data = pandas.read_csv('aadhaar_data.csv').rename(
    columns=lambda name: name.replace(' ', '_').lower()
)
In [22]:
# Distribution of the sample mean of 'age' as the number of trials grows.
# Each trial draws nSamples ages without replacement and records their mean;
# one histogram is drawn per trial count on a 2x2 grid.
nSamples = 100
ageData = aadhaar_data['age']

for panel, nTrials in enumerate([10, 50, 100, 500], start=1):
    sampleArray = np.empty(nTrials)
    for trial in range(nTrials):
        # Shuffle all row positions and keep the first nSamples of them,
        # i.e. sample without replacement.
        p = np.random.permutation(ageData.size)
        # .iloc selects by position, so the draw does not depend on the
        # Series index labels matching 0..size-1.
        sampleArray[trial] = np.mean(ageData.iloc[p[:nSamples]])

    plt.subplot(2, 2, panel)
    plt.hist(sampleArray, bins=30)
    plt.title('Trials = {}'.format(nTrials))
    # Same x-range on every panel so the histograms are directly comparable.
    plt.xlim([20, 40])

# Keep subplot titles from overlapping the axes of the row above.
plt.tight_layout()
Out[22]:
In [4]:
# Distribution of the sample mean of 'age' as the sample size grows.
# The number of trials is fixed; each trial draws nSamples ages without
# replacement and records their mean. One histogram is drawn per sample size
# on a 2x2 grid.
nTrials = 100
ageData = aadhaar_data['age']

for panel, nSamples in enumerate([100, 500, 1000, 2000], start=1):
    sampleArray = np.empty(nTrials)
    for trial in range(nTrials):
        # Shuffle all row positions and keep the first nSamples of them,
        # i.e. sample without replacement.
        p = np.random.permutation(ageData.size)
        # .iloc selects by position, so the draw does not depend on the
        # Series index labels matching 0..size-1.
        sampleArray[trial] = np.mean(ageData.iloc[p[:nSamples]])

    plt.subplot(2, 2, panel)
    plt.hist(sampleArray, bins=30)
    plt.title('nSamples = {}'.format(nSamples))
    # Same x-range on every panel so the narrowing of the spread is visible.
    plt.xlim([20, 40])

# Keep subplot titles from overlapping the axes of the row above.
plt.tight_layout()
Out[4]:
In [8]:
import numpy as np
import scipy.stats as stats
import pandas
# Performs a t-test on two sets of baseball data (left-handed and right-handed hitters).
# The csv file has three columns: a player's name, handedness
# (L for left-handed or R for right-handed) and their career batting
# average (called 'avg').
# The csv file is read into a pandas data frame and Welch's t-test is run on
# the two cohorts defined by handedness.
# Tasks
# (1) Print the mean of the left-handed and right-handed batting averages
# (2) Using p critical = 0.05
#     - print "There is a significant difference" when the p value shows
#       significance, otherwise print "There is NO significant difference"
#     - in both cases, follow the message with the values for t and p
# For example, you may have
#     There is a significant difference
#     t = 9.93570222
#     p = 0.000023
p_critical = 0.05

baseball_data = pandas.read_csv('baseball_data.csv')
left_handed = baseball_data.loc[baseball_data['handedness'] == 'L', 'avg']
right_handed = baseball_data.loc[baseball_data['handedness'] == 'R', 'avg']

# Each print() call receives a single pre-formatted string so the output is the
# same whether the kernel treats print as a statement or as a function.
print("mean batting average (left-handed) = {}".format(np.mean(left_handed)))
print("mean batting average (right-handed) = {}".format(np.mean(right_handed)))

# equal_var=False selects Welch's t-test (no equal-variance assumption).
t, p = stats.ttest_ind(left_handed, right_handed, equal_var=False)

if p <= p_critical:
    print("There is a significant difference")
else:
    print("There is NO significant difference")
print("t = {}".format(t))
print("p = {}".format(p))
In [13]:
# Consume the turnstile_weather data into a dataframe and compare hourly
# entries on rainy vs. non-rainy rows.
# Prints
# (1) the mean number of entries ('ENTRIESn_hourly') with rain
# (2) the mean number of entries without rain
# (3) the Mann-Whitney U statistic and p-value
# (4) whether the difference is significant at the 5% significance level
p_critical = 0.05

master = pandas.read_csv("turnstile_data_master_with_weather.csv")

# The task is about entries, so read 'ENTRIESn_hourly' (not the exits column).
rain = master.loc[master['rain'] == 1, 'ENTRIESn_hourly']
no_rain = master.loc[master['rain'] != 1, 'ENTRIESn_hourly']

# Each print() call receives a single pre-formatted string so the output is the
# same whether the kernel treats print as a statement or as a function.
print('entries with rain = {}'.format(np.mean(rain)))
print('entries with no rain = {}'.format(np.mean(no_rain)))

# NOTE(review): whether the returned p-value is one- or two-sided by default
# depends on the installed SciPy version -- confirm against its documentation
# before comparing with p_critical.
u, p = stats.mannwhitneyu(rain, no_rain)
print('U = {}'.format(u))
print('p = {}'.format(p))

if p <= p_critical:
    print('There is a significant difference')
else:
    print('There is NO significant difference')
In [13]:
In [ ]: